This report analyzes Airbnb listings in Crete, Greece, including price distribution, availability, and location trends.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(readr)
Firstly, I loaded the Airbnb dataset, converted the date column to a date format and then cleaned the price column by removing symbols and converting it to numeric.
# Read the daily calendar export, then parse `date` into Date objects and turn
# `price` into a number by stripping "$", "€" and "," before the conversion.
airbnb_calendar <- read.csv("/Users/christosfacondis/Downloads/calendar.csv") %>%
  mutate(
    date = as.Date(date, format = "%Y-%m-%d"),
    price = as.numeric(gsub("[$€,]", "", price))
  )
str(airbnb_calendar)
## 'data.frame': 9545448 obs. of 7 variables:
## $ listing_id : num 8.94e+17 8.94e+17 8.94e+17 8.94e+17 8.94e+17 ...
## $ date : Date, format: "2024-12-29" "2024-12-30" ...
## $ available : chr "f" "t" "t" "t" ...
## $ price : num 44 44 44 44 44 44 44 44 44 44 ...
## $ adjusted_price: chr "" "" "" "" ...
## $ minimum_nights: int 2 2 2 2 2 2 2 2 2 2 ...
## $ maximum_nights: int 365 365 365 365 365 365 365 365 365 365 ...
summary(airbnb_calendar)
## listing_id date available price
## Min. :2.797e+04 Min. :2024-12-29 Length:9545448 Min. : 0.0
## 1st Qu.:2.807e+07 1st Qu.:2025-03-30 Class :character 1st Qu.: 69.0
## Median :5.313e+07 Median :2025-06-29 Mode :character Median : 110.0
## Mean :4.572e+17 Mean :2025-06-29 Mean : 285.6
## 3rd Qu.:9.240e+17 3rd Qu.:2025-09-28 3rd Qu.: 222.0
## Max. :1.321e+18 Max. :2025-12-29 Max. :75625.0
##
## adjusted_price minimum_nights maximum_nights
## Length:9545448 Min. : 1.000 Min. :1.000e+00
## Class :character 1st Qu.: 2.000 1st Qu.:3.650e+02
## Mode :character Median : 3.000 Median :1.125e+03
## Mean : 4.577 Mean :8.285e+04
## 3rd Qu.: 4.000 3rd Qu.:1.125e+03
## Max. :999.000 Max. :2.147e+09
## NA's :4 NA's :4
In the listings dataset, I converted the neighbourhood and room_type columns into factors.
# Load the listings table and store its two categorical columns as factors.
airbnb_listings <- read.csv("/Users/christosfacondis/Downloads/listings.csv") %>%
  mutate(
    neighbourhood = as.factor(neighbourhood),
    room_type = as.factor(room_type)
  )
str(airbnb_listings)
## 'data.frame': 26152 obs. of 18 variables:
## $ id : num 8.94e+17 8.94e+17 8.94e+17 8.94e+17 8.94e+17 ...
## $ name : chr "Amaria Studio" "BH417 - R - Villa Chania" "Mylos home" "Gouves family house in Heraklion" ...
## $ host_id : int 514918089 461193921 514261598 515317369 461193921 515332334 172461966 147757667 515377223 104219138 ...
## $ host_name : chr "Amalia" "The Best Homes Md Lp" "Ευα" "Dimitris" ...
## $ neighbourhood_group : logi NA NA NA NA NA NA ...
## $ neighbourhood : Factor w/ 24 levels "Αγίου Βασιλείου",..: 10 23 11 24 23 13 19 23 23 19 ...
## $ latitude : num 35.3 35.5 35 35.3 35.5 ...
## $ longitude : num 25.1 23.9 25.7 25.3 23.9 ...
## $ room_type : Factor w/ 4 levels "Entire home/apt",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ price : int 45 NA NA 138 NA NA NA 62 NA 35 ...
## $ minimum_nights : int 2 7 2 3 3 4 4 2 3 2 ...
## $ number_of_reviews : int 12 0 6 0 0 3 6 14 7 6 ...
## $ last_review : chr "2024-10-28" "" "2024-08-29" "" ...
## $ reviews_per_month : num 0.63 NA 0.34 NA NA 0.16 0.36 0.76 0.36 0.83 ...
## $ calculated_host_listings_count: int 2 183 1 1 183 1 3 2 1 2 ...
## $ availability_365 : int 177 184 0 0 207 0 0 364 0 196 ...
## $ number_of_reviews_ltm : int 7 0 3 0 0 1 5 6 3 6 ...
## $ license : chr "2031039" "1042K91003049701" "00002080199" "00001245657" ...
summary(airbnb_listings)
## id name host_id host_name
## Min. :2.797e+04 Length:26152 Min. : 51279 Length:26152
## 1st Qu.:2.807e+07 Class :character 1st Qu.: 55004122 Class :character
## Median :5.313e+07 Mode :character Median :184448975 Mode :character
## Mean :4.572e+17 Mean :231143302
## 3rd Qu.:9.240e+17 3rd Qu.:414440198
## Max. :1.321e+18 Max. :668568866
##
## neighbourhood_group neighbourhood latitude longitude
## Mode:logical Χανίων :5785 Min. :34.83 Min. :23.53
## NA's:26152 Ρεθύμνης :3004 1st Qu.:35.28 1st Qu.:24.02
## Χερσονήσου :2152 Median :35.36 Median :24.49
## Ηρακλείου :2091 Mean :35.35 Mean :24.62
## Αποκορώνου :1758 3rd Qu.:35.50 3rd Qu.:25.14
## Αγίου Νικολάου:1491 Max. :35.59 Max. :26.28
## (Other) :9871
## room_type price minimum_nights number_of_reviews
## Entire home/apt:23518 Min. : 10.0 Min. : 1.000 Min. : 0.00
## Hotel room : 266 1st Qu.: 65.0 1st Qu.: 2.000 1st Qu.: 1.00
## Private room : 2353 Median : 100.0 Median : 3.000 Median : 7.00
## Shared room : 15 Mean : 250.7 Mean : 5.429 Mean : 19.48
## 3rd Qu.: 200.0 3rd Qu.: 4.000 3rd Qu.: 23.00
## Max. :72942.0 Max. :999.000 Max. :605.00
## NA's :2998
## last_review reviews_per_month calculated_host_listings_count
## Length:26152 Min. : 0.010 Min. : 1.00
## Class :character 1st Qu.: 0.170 1st Qu.: 1.00
## Mode :character Median : 0.380 Median : 4.00
## Mean : 0.585 Mean : 25.87
## 3rd Qu.: 0.770 3rd Qu.: 12.00
## Max. :10.790 Max. :283.00
## NA's :5131
## availability_365 number_of_reviews_ltm license
## Min. : 0.0 Min. : 0.000 Length:26152
## 1st Qu.:144.0 1st Qu.: 0.000 Class :character
## Median :225.0 Median : 2.000 Mode :character
## Mean :215.2 Mean : 4.543
## 3rd Qu.:322.0 3rd Qu.: 6.000
## Max. :365.0 Max. :141.000
##
# Load the neighbourhood lookup table and inspect its structure.
airbnb_neighbourhoods <- read.csv(
  "/Users/christosfacondis/Downloads/neighbourhoods.csv"
)
str(airbnb_neighbourhoods)
## 'data.frame': 24 obs. of 2 variables:
## $ neighbourhood_group: logi NA NA NA NA NA NA ...
## $ neighbourhood : chr "Αγίου Βασιλείου" "Αγίου Νικολάου" "Αμάριου" "Ανωγείων" ...
summary(airbnb_neighbourhoods)
## neighbourhood_group neighbourhood
## Mode:logical Length:24
## NA's:24 Class :character
## Mode :character
# Load the per-review table (one row per listing/review date) and inspect it.
# No column is converted here, so `date` stays as read.csv() parsed it.
airbnb_reviews <- read.csv(
  "/Users/christosfacondis/Downloads/reviews.csv"
)
str(airbnb_reviews)
## 'data.frame': 509419 obs. of 2 variables:
## $ listing_id: num 27966 27966 27966 27966 27966 ...
## $ date : chr "2011-09-02" "2012-04-06" "2012-07-05" "2012-08-04" ...
summary(airbnb_reviews)
## listing_id date
## Min. :2.797e+04 Length:509419
## 1st Qu.:1.614e+07 Class :character
## Median :2.762e+07 Mode :character
## Mean :1.493e+17
## 3rd Qu.:4.965e+07
## Max. :1.317e+18
I began by organising the listings data by neighbourhood to identify the areas with the most listings. I then created a plot to visualise the neighbourhoods with the most Airbnb listings.
# Number of listings in each neighbourhood, largest first.
neighbourhood_density <- airbnb_listings %>%
  as_tibble() %>%
  count(neighbourhood, name = "total_listings", sort = TRUE)
head(neighbourhood_density)
## # A tibble: 6 × 2
## neighbourhood total_listings
## <fct> <int>
## 1 Χανίων 5785
## 2 Ρεθύμνης 3004
## 3 Χερσονήσου 2152
## 4 Ηρακλείου 2091
## 5 Αποκορώνου 1758
## 6 Αγίου Νικολάου 1491
# Horizontal bar chart of the first ten rows of the (already sorted) density
# table; reorder() makes the bars appear in order of listing count.
ggplot(
  neighbourhood_density[seq_len(10), ],
  aes(x = reorder(neighbourhood, total_listings), y = total_listings)
) +
  geom_col(fill = "skyblue", alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Top 10 Neighbourhoods with Most Airbnb Listings",
    x = "Neighbourhood",
    y = "Number of Listings"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold")
  )
Firstly, I cleaned the data by removing NAs, zeros and non-finite values. Then, I grouped the prices of the listings by neighbourhood and computed the following aggregations (mean, median, standard deviation, size) for each neighbourhood. Finally, I sorted the calculated data by median in descending order, because Airbnb prices don’t follow a normal distribution and the median is less affected by extreme prices than the mean.
sum(is.na(airbnb_listings$price))
## [1] 2998
sum(is.nan(airbnb_listings$price))
## [1] 0
sum(is.infinite(airbnb_listings$price))
## [1] 0
sum(airbnb_listings$price <= 0, na.rm=TRUE)
## [1] 0
# Keep only listings with a usable price. is.finite() is FALSE for NA, NaN and
# +/-Inf, so a single test covers all three checks.
# NOTE: this overwrites `airbnb_listings`, so every later section that reads it
# works on the priced listings only.
airbnb_listings <- airbnb_listings %>%
  mutate(price = as.numeric(price)) %>%
  filter(is.finite(price), price > 0)
# Per-neighbourhood price summary (mean, median, standard deviation, listing
# count), ordered from the highest median price to the lowest.
prices_by_neighbourhood <- airbnb_listings %>%
  group_by(neighbourhood) %>%
  summarise(
    across(
      price,
      list(
        mean = function(p) mean(p, na.rm = TRUE),
        median = function(p) median(p, na.rm = TRUE),
        sd = function(p) sd(p, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}" # yields mean_price, median_price, sd_price
    ),
    total_listings = n()
  ) %>%
  arrange(desc(median_price))
head(prices_by_neighbourhood, 10)
## # A tibble: 10 × 5
## neighbourhood mean_price median_price sd_price total_listings
## <fct> <dbl> <dbl> <dbl> <int>
## 1 Μυλοποτάμου 425. 198 1223. 665
## 2 Αποκορώνου 287. 170 571. 1596
## 3 Ρεθύμνης 283. 132 779. 2636
## 4 Πλατανιά 270. 130 905. 1147
## 5 Αγίου Βασιλείου 208. 109 634. 1044
## 6 Κισσάμου 352. 109 1030. 1282
## 7 Αγίου Νικολάου 391. 100 1251. 1324
## 8 Φαιστού 206. 100 798. 932
## 9 Χερσονήσου 357. 100 1478. 1895
## 10 Χανίων 205. 97 1146. 4996
I selected the top 10 most expensive neighbourhoods and created a dot-and-line plot. I plotted the median price in red and the mean price in blue, connecting them with a dashed gray line.
# First ten rows of the price summary, i.e. the highest median prices.
expensive_neighbourhoods <- prices_by_neighbourhood[seq_len(10), ]

# Dot-and-line chart: one red dot (median) and one blue dot (mean) per
# neighbourhood, joined by a dashed grey segment.
ggplot(
  expensive_neighbourhoods,
  aes(x = reorder(neighbourhood, median_price))
) +
  geom_point(aes(y = median_price), color = "red", size = 5, alpha = 0.8) +
  geom_point(aes(y = mean_price), color = "blue", size = 5, alpha = 0.8) +
  geom_segment(
    aes(y = median_price, yend = mean_price, xend = neighbourhood),
    color = "gray",
    linetype = "dashed"
  ) +
  coord_flip() + # neighbourhood names read better on the vertical axis
  labs(
    title = "Top 10 Neighbourhoods by Airbnb Price (Median vs Mean)",
    subtitle = "Red = Median Price | Blue = Mean Price",
    x = "Neighbourhood",
    y = "Price"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, face = "italic")
  )
I aimed to visualize the distribution of Airbnb listing prices in Crete by creating a histogram that displays the frequency of different price amounts while also including reference lines for the mean and median prices.
# Histogram of listing prices with mean/median reference lines.
# The price filter is idempotent; repeating it keeps this section runnable on
# its own.
airbnb_listings <- airbnb_listings %>%
  mutate(price = as.numeric(price)) %>%
  filter(is.finite(price), price > 0)

# Compute the reference values once and pass them as constants: mapping them
# through aes() would draw one overlapping line per data row.
price_mean <- mean(airbnb_listings$price, na.rm = TRUE)
price_median <- median(airbnb_listings$price, na.rm = TRUE)
# Upper end of the visible x range: the 95th percentile hides extreme outliers.
price_cap <- quantile(airbnb_listings$price, 0.95, na.rm = TRUE, names = FALSE)

ggplot(airbnb_listings, aes(x = price)) +
  geom_histogram(binwidth = 10, fill = "navy", color = "black", alpha = 0.7) +
  # `linewidth` replaces the `size` aesthetic deprecated for lines in
  # ggplot2 3.4.0.
  geom_vline(
    xintercept = price_mean, color = "red",
    linetype = "solid", linewidth = 1.2
  ) +
  geom_vline(
    xintercept = price_median, color = "black",
    linetype = "solid", linewidth = 1.2
  ) +
  # Only generate breaks for the visible range.
  scale_x_continuous(breaks = seq(0, price_cap, by = 100)) +
  # Zoom with coord_cartesian() rather than scale limits: scale limits discard
  # out-of-range rows and the bars touching the limits before drawing.
  coord_cartesian(xlim = c(0, price_cap)) +
  labs(
    title = "Histogram of Airbnb Prices in Crete",
    x = "Price Amounts",
    y = "Frequency",
    subtitle = "Red Line = Mean Price | Black Line = Median Price"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, face = "italic")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1120 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
### Question 3b: Average Price per Date
I calculated the average price per date by grouping the data by date and computing the mean price. Then, I created a line plot of the average price over time. I formatted the x-axis to display monthly date labels.
# Mean calendar price for each date, in chronological order.
avg_price_per_date <- airbnb_calendar %>%
  group_by(date) %>%
  summarise(AvgPrice = mean(price, na.rm = TRUE)) %>%
  arrange(date)

# Line plot of the daily average price with one x-axis label per month.
# Every layer must be chained with `+`: without it after geom_line(), the
# labels, theme and date scale are evaluated as a separate expression and never
# reach the plot.
ggplot(avg_price_per_date, aes(x = as.Date(date), y = AvgPrice)) +
  # `linewidth` replaces the `size` aesthetic deprecated for lines in
  # ggplot2 3.4.0.
  geom_line(color = "blue", linewidth = 1) +
  labs(
    title = "Average Price Per Date",
    x = "Date",
    y = "Average Price (€)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1), # rotated
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold")
  ) +
  scale_x_date(date_labels = "%b %d", date_breaks = "1 month")
## NULL
I aimed to analyze and visualize the availability of Airbnb listings per day by filtering and counting available listings, grouping them by date, and plotting the results. When plotting this availability, I observed that the most available listings can be found from May to October.
# Number of calendar rows flagged as available ("t") on each date, in
# chronological order.
availability_per_date <- airbnb_calendar %>%
  filter(available == "t") %>%
  group_by(date) %>%
  summarise(no_listings_available = n()) %>%
  arrange(date)

# Line plot of the daily count of available listings.
ggplot(
  availability_per_date,
  aes(x = as.Date(date), y = no_listings_available)
) +
  # `linewidth` replaces the `size` aesthetic deprecated for lines in
  # ggplot2 3.4.0.
  geom_line(color = "blue", linewidth = 1) +
  labs(
    title = "Available Listings per Date",
    x = "Date",
    y = "Number of Available Listings"
  ) +
  theme_minimal()
I aimed to analyze and visualize the distribution of reviews per Airbnb listing in Crete by creating a histogram that displays the frequency of different review counts while also including reference lines for the mean and median.
# Histogram of review counts per listing with mean/median reference lines.
mean_reviews <- mean(airbnb_listings$number_of_reviews, na.rm = TRUE)
median_reviews <- median(airbnb_listings$number_of_reviews, na.rm = TRUE)
# Size of the most common review count; used to place the text labels.
peak_count <- max(table(airbnb_listings$number_of_reviews))

ggplot(airbnb_listings, aes(x = number_of_reviews)) +
  # One bar per whole review count (bins centred on the integers), so the bar
  # heights match the counts used for `peak_count`.
  geom_histogram(
    binwidth = 1, center = 0,
    fill = "navy", color = "black", alpha = 0.7
  ) +
  # Constants are passed directly instead of through aes(), which would draw
  # one overlapping line per data row.
  geom_vline(xintercept = mean_reviews, color = "red", linewidth = 1.2) +
  geom_vline(xintercept = median_reviews, color = "black", linewidth = 1.2) +
  scale_x_continuous(breaks = seq(0, 90, 10)) +
  # Zoom to 0-90 with coord_cartesian(): scale limits would discard the bins
  # that straddle 0 and 90, which removes the bar of zero-review listings.
  coord_cartesian(xlim = c(0, 90)) +
  labs(
    title = "Histogram of Number of Reviews per listing",
    x = "Review Amounts",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  annotate(
    "text", x = mean_reviews, y = peak_count,
    label = "Mean", color = "red", hjust = -0.2
  ) +
  annotate(
    "text", x = median_reviews, y = peak_count - 10,
    label = "Median", color = "black", hjust = -0.2
  )
## Warning: Removed 940 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
I analyzed the distribution of different Airbnb room types in Crete to understand how prices vary and how common each room type is. I used a bar plot, violin plot, and frequency analysis to explore these insights.
# Listing count per room type (largest first) plus each type's share of all
# listings, formatted as a percentage string.
df_room_type <- airbnb_listings %>%
  group_by(room_type) %>%
  tally(name = "no_listings_per_room_type", sort = TRUE)

total_listings <- sum(df_room_type$no_listings_per_room_type)

df_room_type <- df_room_type %>%
  mutate(
    percentage = paste0(
      round(no_listings_per_room_type / total_listings * 100, 2),
      "%"
    )
  )
print(df_room_type)
## # A tibble: 4 × 3
## room_type no_listings_per_room_type percentage
## <fct> <int> <chr>
## 1 Entire home/apt 21067 90.99%
## 2 Private room 1864 8.05%
## 3 Hotel room 210 0.91%
## 4 Shared room 13 0.06%
# Bar chart of listings per room type, tallest bar first, with each bar
# labelled by its percentage share.
ggplot(
  df_room_type,
  aes(
    x = reorder(room_type, -no_listings_per_room_type),
    y = no_listings_per_room_type,
    fill = room_type
  )
) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = percentage), vjust = -0.5, size = 2) +
  labs(
    title = "Number of Listings per Room Type",
    x = "Room Type",
    y = "Number of Listings"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 10, face = "bold")
  ) +
  scale_fill_manual(values = c("#FFB6C1", "#ADD8E6", "#FFDDC1", "#C1E1C1"))
# Listings with a positive price below the 95th percentile; all three
# conditions are evaluated on the same rows and combined with AND.
listings <- airbnb_listings %>%
  mutate(price = as.numeric(price)) %>%
  filter(
    !is.na(price),
    price > 0,
    price < quantile(price, 0.95, na.rm = TRUE)
  )

# Violin plot of price by room type with a narrow box plot drawn on top.
ggplot(listings, aes(x = room_type, y = price, fill = room_type)) +
  geom_violin(alpha = 0.7, trim = FALSE, color = "black") +
  geom_boxplot(width = 0.1, fill = "white", alpha = 0.5, outlier.shape = NA) +
  labs(
    title = "Price Distribution by Room Type",
    x = "Room Type",
    y = "Price"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold")
  ) +
  scale_fill_manual(values = c("#FFB6C1", "#ADD8E6", "#FFDDC1", "#C1E1C1"))
I aimed to analyze and visualize the geographic distribution of Airbnb listings in Crete using an interactive map. This map allows users to explore locations dynamically, view listing details, and see how room types are distributed geographically.
library(leaflet)
library(RColorBrewer)
# Colour lookup: one fixed colour per room type.
room_colors <- colorFactor(
  palette = c("red", "blue", "green", "purple"),
  domain = airbnb_listings$room_type
)

# Interactive map: one circle marker per listing, coloured by room type, with
# a popup showing room type, price and review count, plus a colour legend.
leaflet(airbnb_listings) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    color = ~room_colors(room_type),
    radius = 3,
    opacity = 0.7,
    fillOpacity = 0.5,
    popup = ~paste(
      "<b>Room Type:</b>", room_type,
      "<br><b>Price:</b>", price, "€",
      "<br><b>Reviews:</b>", number_of_reviews
    )
  ) %>%
  addLegend(
    position = "bottomright",
    pal = room_colors,
    values = ~room_type,
    title = "Room Type"
  )
In this question, I examined how many listings each host has.
# Number of listings per host, largest first.
# `.groups = "drop"` returns an ungrouped tibble: by default summarise() keeps
# the result grouped by `host_id` (and prints a message about it), which would
# make later verbs on this table operate per host instead of on the whole
# table.
listings_per_host <- airbnb_listings %>%
  group_by(host_id, host_name) %>%
  summarise(no_of_listings = n(), .groups = "drop") %>%
  arrange(desc(no_of_listings))
## `summarise()` has grouped output by 'host_id'. You can override using the
## `.groups` argument.
head(listings_per_host)
## # A tibble: 6 × 3
## # Groups: host_id [6]
## host_id host_name no_of_listings
## <int> <chr> <int>
## 1 4301312 Valia 276
## 2 12389816 Kostas 259
## 3 61036077 Stratos 222
## 4 40567005 Emmanuel & Yannis 179
## 5 461193921 The Best Homes Md Lp 153
## 6 9114389 Antonis 141